The aim of this script is to compare if there is a systemic bias between WGBS processed samples and targeted sequenced samples and to remove it. Also, the targeted sequencing data functions as validation cohort for the DMRs identified from the discovery cohort.
The sequencing bias could best be removed by combining a quantile normalisation with a ComBat adjustment.Further approaches e.g. winsorization have been also tested in this script.
library(tidyverse)
library(DescTools)
library(preprocessCore)
library(ggfortify)
library(sva)
Loading required package: mgcv
Loading required package: nlme
Attaching package: 'nlme'
The following object is masked from 'package:dplyr':
collapse
This is mgcv 1.8-40. For overview type 'help("mgcv-package")'.
Loading required package: genefilter
Attaching package: 'genefilter'
The following object is masked from 'package:DescTools':
AUC
The following object is masked from 'package:readr':
spec
Loading required package: BiocParallel
library(ggpubr)
library(ggrepel)
theme_Publication <- function(base_size=14, base_family="sans") {
library(grid)
library(ggthemes)
(theme_foundation(base_size=base_size, base_family=base_family)
+ theme(plot.title = element_text(face = "bold",
size = rel(1.2), hjust = 0.5, margin = margin(0,0,20,0)),
text = element_text(),
panel.background = element_rect(colour = NA),
plot.background = element_rect(colour = NA),
panel.border = element_rect(colour = NA),
axis.title = element_text(face = "bold",size = rel(1)),
axis.title.y = element_text(angle=90,vjust =2),
axis.title.x = element_text(vjust = -0.2),
axis.text = element_text(),
axis.line.x = element_line(colour="black"),
axis.line.y = element_line(colour="black"),
axis.ticks = element_line(),
panel.grid.major = element_line(colour="#f0f0f0"),
panel.grid.minor = element_blank(),
legend.key = element_rect(colour = NA),
legend.position = "bottom",
legend.direction = "horizontal",
legend.box = "vetical",
legend.key.size= unit(0.5, "cm"),
#legend.margin = unit(0, "cm"),
legend.title = element_text(face="italic"),
plot.margin=unit(c(10,5,5,5),"mm"),
strip.background=element_rect(colour="#f0f0f0",fill="#f0f0f0"),
strip.text = element_text(face="bold")
))
}
meth_perc_all_tiles_validation <- readRDS(file = "/local/rcuadrat/data_for_altuna/methyl_perc_all_tiles_no_CAD_validation.RDS")
meth_perc_all_tiles_WGBS <- readRDS(file = "/local/rcuadrat/data_for_altuna/methyl_perc_all_tiles_no_CAD_WGBS.RDS")
metadata <- readRDS(file = "/local/rcuadrat/data_for_altuna/metadata_no_CAD.RDS")
data_for_scatterplot <- readRDS(file = "/local/rcuadrat/data_for_altuna/data_for_scatterplot.rds")
methylBaseDB_validation_no_CAD <- readRDS("/local/rcuadrat/data_for_altuna/methylBaseDB_validation_no_CAD.RDS")
methylBaseDB_WGBS_all_samples <- readRDS("/local/rcuadrat/data_for_altuna/methylBaseDB_WGBS_all_samples.RDS")
methylDiff_WGBS <- readRDS(file= "/local/AAkalin_cardiac/Results/cardiac/RDS/myDiff25p_tiled_list_v1.RDS")
methylDiff_validation<- readRDS(file= "/local/rcuadrat/data_for_altuna/methylDiff_validation_ACS.RDS")
methylDiff_WGBS <- methylDiff_WGBS[1:3]
names(methylDiff_WGBS)<-c("STEMI", "NSTEMI", "UA")
methylDiff_WGBS <- methylDiff_WGBS %>% map2(names(.), ~ as_tibble(.x) %>%
add_column(condition=.y) %>%
tidyr::unite(tiles, seqnames, start, end, condition, sep="."))
methylDiff_validation <- methylDiff_validation %>% map2(names(.), ~ methylKit::getData(.x) %>%
as_tibble(.x) %>%
add_column(condition=.y) %>%
tidyr::unite(tiles, chr, start, end, condition, sep="."))
clinical_markers<-readxl::read_excel("/local/AAkalin_cardiac/metadata/WGBS_ShortTableCorrelations.xlsx")
suppl_tbl_2 <- readRDS("/local/rcuadrat/cfdna_wgbs/suppl_tbl_2.RDS")
clinical_markers<-merge(suppl_tbl_2,clinical_markers,by="Sample")
#clinical_markers = clinical_markers %>% rename("CRP (mg/dl)"="CRP(mg/dl_<5)","TroponinT (ng/l)"="TroponinThs_t1 (ng/l <14or<50)","CK (U/l)"="CK (U/l _<190)","CK max (U/l)"="CKmax(U/l_<190)","CK-MB (U/l)"="CK-MB(U/l <24)","CK-MB max (U/l)"="CK-MB_Max")
clinical_markers_ACS<-clinical_markers %>% filter(treatment !=0)
seq_method <- rep(c("WGBS", "targeted"), times = c(29, 11))
metadata$seq_method <- seq_method
metadata <- metadata %>%
mutate(condition = case_when(
(treatment == 0) ~ "Control",
(treatment == 1) ~ "STEMI",
(treatment == 2) ~ "NSTEMI",
(treatment == 3) ~ "UA"
))
nrow(meth_perc_all_tiles_validation)
[1] 10138
length(setdiff(rownames(meth_perc_all_tiles_validation), rownames(meth_perc_all_tiles_WGBS)))
[1] 1026
length(intersect(rownames(meth_perc_all_tiles_validation), rownames(meth_perc_all_tiles_WGBS)))
[1] 9112
all_common_tiles<-intersect(rownames(meth_perc_all_tiles_validation), rownames(meth_perc_all_tiles_WGBS))
Find out if tiles which do not match WGBS are mapping to specific location Altuna:“…we have designed probes for 10K regions but for whatever reason some probes don’t work that’s why targeted sequencing doesn’t cover whole 10K tiles but same tiles exist on WGBS because that was the basis of our design. losing probes are expected, the efficiency of targeting is 80-90%”
missing_tiles <- setdiff(rownames(meth_perc_all_tiles_validation), rownames(meth_perc_all_tiles_WGBS))
missing_tiles <- tibble(missing_tiles) %>%
separate(missing_tiles, into = c("chrom", "start", "end"), sep = "\\.") %>%
arrange(chrom, start, end)
Map corresponding tiles of both seq methods in one
overlap_tiles_validation_WGBS <- inner_join(x=as_tibble(meth_perc_all_tiles_validation, rownames = "tiles"), y=as_tibble(meth_perc_all_tiles_WGBS, rownames = "tiles"))
Joining, by = "tiles"
Transform to long format
all_data <- overlap_tiles_validation_WGBS %>%
pivot_longer(-tiles, names_to = "sample_ids", values_to = "perc_meth") %>%
left_join(metadata, by = "sample_ids") %>%
mutate(treatment = as.factor(treatment))
# Map metadata and pivot
overlap_tiles_validation_WGBS_mean <- overlap_tiles_validation_WGBS %>%
pivot_longer(-tiles, names_to = "sample_ids", values_to = "perc_meth") %>%
left_join(metadata, by = "sample_ids") %>%
mutate(treatment = as.factor(treatment))
#calculate mean per condition
overlap_tiles_validation_WGBS_mean <- overlap_tiles_validation_WGBS_mean %>%
group_by(tiles, seq_method, condition) %>%
summarise_at(vars(perc_meth), list(name = mean))
overlap_tiles_validation_WGBS_mean <-
dplyr::rename(overlap_tiles_validation_WGBS_mean, mean = "name")
#calculate difference of means
overlap_tiles_validation_WGBS_mean_diff <- overlap_tiles_validation_WGBS_mean %>%
pivot_wider(values_from = mean, names_from =condition) %>%
mutate(meth.diff_STEMI = STEMI - Control,
meth.diff_NSTEMI = NSTEMI - Control,
meth.diff_UA = UA - Control) %>%
dplyr::select(-c(Control, NSTEMI, STEMI, UA)) %>%
dplyr::rename(STEMI = "meth.diff_STEMI",
NSTEMI ="meth.diff_NSTEMI",
UA = "meth.diff_UA") %>%
pivot_longer(-c(tiles, seq_method), names_to = "condition", values_to = "meth_diff" ) %>%
pivot_wider(names_from = "seq_method", values_from ="meth_diff") %>%
dplyr::rename(meth.diff.WGBS = "WGBS", meth.diff.targeted = "targeted")
#Calculate threshold for scatterplot
overlap_tiles_validation_WGBS_mean_diff <- overlap_tiles_validation_WGBS_mean_diff %>%
mutate(threshold_25= case_when((((25 <= meth.diff.WGBS) & (25 <= meth.diff.targeted)) | (((meth.diff.WGBS <=-25) & (meth.diff.targeted <=-25))) ) ~ "|25%|", TRUE ~ "not in range"))
# Plot
overlap_tiles_validation_WGBS_mean_diff %>%
ggplot(aes(x = meth.diff.targeted, y = meth.diff.WGBS, colour=threshold_25)) +
geom_point(shape = 16, size=2, alpha=0.6) +
stat_cor(method="pearson")+
geom_vline(xintercept = c(-25, 25), linetype="dotted")+
geom_hline(yintercept = c(-25, 25), linetype="dotted")+
ylab("Mean Methylation Difference WGBS") +
xlab("Mean Methylation Difference Targeted Sequencing") +
labs(title = "Raw Data \nAll Common Tiles", colour="Threshold") +
theme(plot.title = element_text(hjust = 0.5, face ="bold", size=16), aspect.ratio = 1) +
scale_color_manual(values=c("#919494", "#7CAE00"))+
facet_grid(cols = vars(condition))
p28 <-
overlap_tiles_validation_WGBS_mean_diff %>%
ggplot(aes(x = condition, y = meth.diff.targeted, fill = condition)) +
geom_boxplot(outlier.shape = NA) +
ylab("Mean Methylation Difference [%]") +
xlab("ACS Type") +
labs(title = "Targeted Sequencing") +
scale_fill_discrete(name = "ACS Type") +
theme(plot.title = element_text(hjust = 0.5)) +
ylim(-100, 100)
p29 <-
overlap_tiles_validation_WGBS_mean_diff %>%
ggplot(aes(x = condition, y = meth.diff.WGBS, fill = condition)) +
geom_boxplot(outlier.shape = NA) +
ylab("Mean Methylation Difference [%]") +
xlab("ACS Type") +
labs(title = "WGBS") +
scale_fill_discrete(name = "ACS Type") +
ylim(-100, 100) +
theme(plot.title = element_text(hjust = 0.5))
annotate_figure(ggarrange(p28, p29, nrow=1, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Raw Data \nAll Common Tiles", face = "bold", size=16))
tiles <- unlist(overlap_tiles_validation_WGBS[, 1])
overlap_tiles_validation_WGBS_winsorized <- as_tibble(lapply(overlap_tiles_validation_WGBS[, -1], Winsorize, probs = c(0.1, 0.9))) %>%
add_column(tiles = tiles, .before = "2017") %>%
pivot_longer(-tiles, names_to = "sample_ids", values_to = "perc_meth") %>%
left_join(metadata, by = "sample_ids") %>%
mutate(treatment = as.factor(treatment))
# Plot data to compare methylation between treated diseases
p8 <- overlap_tiles_validation_WGBS_winsorized %>%
ggplot(aes(x = condition, y = perc_meth, fill = seq_method)) +
geom_boxplot() +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5))
p9 <- overlap_tiles_validation_WGBS_winsorized %>%
ggplot(aes(x = condition, y = perc_meth, fill = seq_method)) +
geom_violin() +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5))
p10 <- overlap_tiles_validation_WGBS_winsorized %>% ggplot(aes(x = condition, y = perc_meth)) +
geom_violin(aes(fill = seq_method)) +
geom_boxplot(aes(fill = seq_method), width = 0.05) +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5)) +
facet_grid(seq_method ~ .)
p11 <- overlap_tiles_validation_WGBS_winsorized %>% ggplot(aes(x = condition, y = perc_meth)) +
geom_violin(fill = "#7C7CFF") +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5))
annotate_figure(ggarrange(p8, p9, p10, p11, common.legend = TRUE, legend="right", labels="AUTO", nrow=2, ncol=2), text_grob("Windsorized Data \nAll Common Tiles", face = "bold", size=16))
p12 <- overlap_tiles_validation_WGBS_winsorized %>%
filter(seq_method == "WGBS") %>%
ggplot(aes(x = sample_ids, y = perc_meth)) +
geom_violin(fill = "#F8766D") +
geom_boxplot(width = 0.05) +
ylab("Methylation [%]") +
xlab("Sample") +
labs(title = "WGBS") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, vjust = 0.8, hjust = 1))
p13 <- overlap_tiles_validation_WGBS_winsorized %>%
filter(seq_method == "targeted") %>%
ggplot(aes(x = sample_ids, y = perc_meth)) +
geom_violin(fill = "#00BFC4") +
geom_boxplot(width = 0.05) +
ylab("Methylation [%]") +
xlab("Sample") +
labs(title = "Targeted") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5))
overlap_tiles_validation_WGBS_winsorized %>% ggplot(aes(x = sample_ids, y = perc_meth, fill = seq_method)) +
geom_violin() +
geom_boxplot(width = 0.05) +
ylab("Methylation [%]") +
xlab("Sample") +
labs(title = "Windsorized Data\nAll Common Tiles") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5, face="bold", size=16), axis.text.x = element_text(angle = 45, vjust = 0.8, hjust = 1))
annotate_figure(ggarrange(p12, p13, nrow=2, ncol=1, labels="AUTO"), text_grob("Windsorized Data \nAll Common Tiles", face = "bold", size=16))
# Seperate targeted and WGBS data into two tibbles
targeted_meth_perc <- overlap_tiles_validation_WGBS %>% dplyr::select((metadata %>% filter(seq_method == "targeted"))$sample_ids)
WGBS_meth_perc <- overlap_tiles_validation_WGBS %>% dplyr::select((metadata %>% filter(seq_method == "WGBS"))$sample_ids)
# make WGBS data a vector
WGBS_meth_perc_distribution <- c(t(WGBS_meth_perc))
# normalise and assign column names to data
targeted_meth_perc_normalised <- normalize.quantiles.use.target(as.matrix(targeted_meth_perc), WGBS_meth_perc_distribution, copy = TRUE) %>% as_tibble()
colnames(targeted_meth_perc_normalised) <- colnames(targeted_meth_perc)
# add tiles column and join both dataframes
targeted_meth_perc_normalised <- targeted_meth_perc_normalised %>%
add_column(tiles = tiles, .before = "2017")
WGBS_meth_perc <- WGBS_meth_perc %>%
add_column(tiles = tiles, .before = "N1")
overlap_normalised <- WGBS_meth_perc %>% left_join(targeted_meth_perc_normalised, by = "tiles")
# map metadata and pivot
overlap_normalised <- overlap_normalised %>%
pivot_longer(-tiles, names_to = "sample_ids", values_to = "perc_meth") %>%
left_join(metadata, by = "sample_ids") %>%
mutate(treatment = as.factor(treatment))
p14 <- overlap_normalised %>% ggplot(aes(x = condition, y = perc_meth, fill = seq_method)) +
geom_boxplot() +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5))
p15 <- overlap_normalised %>% ggplot(aes(x = condition, y = perc_meth, fill = seq_method)) +
geom_violin() +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5))
p16 <- overlap_normalised %>% ggplot(aes(x = condition, y = perc_meth)) +
geom_violin(aes(fill = seq_method)) +
geom_boxplot(aes(fill = seq_method), width = 0.05) +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5)) +
facet_grid(seq_method ~ .)
p17 <- overlap_normalised %>% ggplot(aes(x = condition, y = perc_meth)) +
geom_violin(fill = "#7C7CFF") +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5))
annotate_figure(ggarrange(p14, p15, p16, p17, nrow=2, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Quantile Normalised Data \nAll Common Tiles", face = "bold", size=16))
p18 <- overlap_normalised %>%
filter(seq_method == "targeted") %>%
ggplot(aes(x = sample_ids, y = perc_meth)) +
geom_violin(fill = "#00BFC4") +
geom_boxplot(width = 0.05) +
ylab("Methylation [%]") +
xlab("Sample") +
labs(title = "Targeted Sequencing") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, vjust = 0.8, hjust = 1))
p19 <- overlap_normalised %>%
filter(seq_method == "WGBS") %>%
ggplot(aes(x = sample_ids, y = perc_meth)) +
geom_violin(fill = "#F8766D") +
geom_boxplot(width = 0.05) +
ylab("Methylation [%]") +
xlab("Sample") +
labs(title = "WGBS") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 45, vjust = 0.8, hjust = 1))
overlap_normalised %>% ggplot(aes(x = sample_ids, y = perc_meth, fill = seq_method)) +
geom_violin() +
geom_boxplot(width = 0.05) +
ylab("Methylation [%]") +
xlab("Sample") +
labs(title = "Quantile Normalised Data") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5, face="bold", size=16), axis.text.x = element_text(angle = 45, vjust = 0.8, hjust = 1))
annotate_figure(ggarrange(p18, p19, nrow=2, ncol=1, labels="AUTO"), text_grob("Quantile Normalised Data \nAll Common Tiles", face = "bold", size=16))
overlap_normalised_mean <- overlap_normalised %>%
group_by(tiles, seq_method, condition) %>%
summarise_at(vars(perc_meth), list(name = mean))
overlap_normalised_mean <- dplyr::rename(overlap_normalised_mean, mean = "name")
overlap_normalised_mean_diff <- overlap_normalised_mean %>%
pivot_wider(values_from = mean, names_from = condition) %>%
mutate(
meth.diff_STEMI = STEMI - Control,
meth.diff_NSTEMI = NSTEMI - Control,
meth.diff_UA = UA - Control
) %>%
dplyr::select(-c(Control, NSTEMI, STEMI, UA)) %>%
dplyr::rename(STEMI = "meth.diff_STEMI", NSTEMI = "meth.diff_NSTEMI", UA = "meth.diff_UA") %>%
pivot_longer(-c(tiles, seq_method),
names_to = "condition",
values_to = "meth_diff"
) %>%
pivot_wider(names_from = "seq_method", values_from = "meth_diff") %>%
dplyr::rename(meth.diff.WGBS = "WGBS", meth.diff.targeted = "targeted")
data_for_scatterplot <- data_for_scatterplot %>% tidyr::unite(tiles, seqnames, start, end, group, sep = ".")
overlap_DM_data_normalised_mean_diff <- overlap_normalised_mean_diff %>%
tidyr::unite(tiles, tiles, condition, sep = ".") %>%
filter(tiles %in% c(data_for_scatterplot$tiles))
data_for_scatterplot <- data_for_scatterplot %>% tidyr::separate(tiles, into = c("seqnames", "start", "end", "group"), sep = "\\.")
overlap_DM_data_normalised_mean_diff <- overlap_DM_data_normalised_mean_diff %>% tidyr::separate(tiles, into = c("chrom", "start", "end", "condition"), sep = "\\.")
Compare relationship between raw and normalised data and seq technique
#Calculate threshold for scatterplot
data_for_scatterplot <- data_for_scatterplot %>%
mutate(threshold_25= case_when((((25 <= meth.diff.x) & (25 <= meth.diff.y)) | (((meth.diff.x <=-25) & (meth.diff.y <=-25))) ) ~ "|25%|", TRUE ~ "not in range"))
overlap_DM_data_normalised_mean_diff <- overlap_DM_data_normalised_mean_diff %>%
mutate(threshold_25= case_when((((25 <= meth.diff.targeted) & (25 <= meth.diff.WGBS)) | (((meth.diff.targeted <=-25) & (meth.diff.WGBS <=-25))) ) ~ "|25%|", TRUE ~ "not in range"))
p20 <- data_for_scatterplot %>%
ggplot(aes(x = meth.diff.x, y = meth.diff.y, colour=threshold_25)) +
geom_point(shape = 16, size=2, alpha=0.6) +
geom_vline(xintercept = c(-25, 25), linetype="dotted")+
geom_hline(yintercept = c(-25, 25), linetype="dotted")+
ylab("Weighted Mean Methylation \nDifference Y") +
xlab("Weighted Mean Methylation \nDifference X") +
labs(title = "Raw Data \nCoverage Weighted", colour="Threshold") +
stat_cor(method="pearson")+
ylim(-100, 100) +
xlim(-100, 100) +
theme(plot.title = element_text(hjust = 0.5, face="bold", size=16), aspect.ratio = 1) +
scale_color_manual(values=c("#919494", "#7CAE00"))+
facet_grid(cols = vars(group))
p21 <- overlap_DM_data_normalised_mean_diff %>%
ggplot(aes(x = meth.diff.targeted, y = meth.diff.WGBS, colour=threshold_25)) +
geom_point(shape = 16, size=2, alpha=0.6) +
geom_vline(xintercept = c(-25, 25), linetype="dotted")+
geom_hline(yintercept = c(-25, 25), linetype="dotted")+
ylab("Mean Methylation Difference WGBS") +
xlab("Mean Methylation Difference \nTargeted Sequencing") +
labs(title = "Quantile Normalised Data", colour="Threshold") +
stat_cor(method="pearson")+
ylim(-100, 100) +
xlim(-100, 100) +
theme(plot.title = element_text(hjust = 0.5, face="bold", size=16), aspect.ratio = 1) +
scale_color_manual(values=c( "#919494", "#7CAE00"))+
facet_grid(cols = vars(condition))
annotate_figure(ggarrange(p20, p21, labels="AUTO", nrow=2, ncol=1), text_grob("DM Tiles", face = "bold", size=16))
Use weighted methylation differences for WGBS data
data_for_scatterplot <- data_for_scatterplot %>% tidyr::unite(tiles, seqnames, start, end, group, sep = ".")
diff_meth_tiles <- data_for_scatterplot$tiles
diff.meth.y.weighted <- data_for_scatterplot %>% dplyr::select(tiles, meth.diff.y)
overlap_DM_data_normalised_mean_diff <- overlap_DM_data_normalised_mean_diff %>%
tidyr::unite(tiles, chrom, start, end, condition, sep = ".") %>%
left_join(diff.meth.y.weighted, by = "tiles") %>%
tidyr::separate(tiles, into = c("chrom", "start", "end", "condition"), sep = "\\.")
overlap_DM_data_normalised_mean_diff <- overlap_DM_data_normalised_mean_diff %>%
mutate(threshold_25_y_weighted= case_when((((25 <= meth.diff.targeted) & (25 <= meth.diff.y)) | (((meth.diff.targeted <=-25) & (meth.diff.y <=-25))) ) ~ "|25%|", TRUE ~ "not in range"))
p22 <- overlap_DM_data_normalised_mean_diff %>% ggplot(aes(x = meth.diff.targeted, y = meth.diff.y, colour=threshold_25_y_weighted)) +
geom_point(shape = 16, size=2, alpha=0.6) +
geom_vline(xintercept = c(-25, 25), linetype="dotted")+
geom_hline(yintercept = c(-25, 25), linetype="dotted")+
ylab("Weighted Mean Methylation \nDifference WGBS") +
xlab("Mean Methylation Difference \nTargeted Sequencing") +
labs(title = "Quantile Normalised Data", colour="Threshold") +
stat_cor(method="pearson")+
ylim(-100, 100) +
xlim(-100, 100) +
theme(plot.title = element_text(hjust = 0.5, face="bold", size=16), aspect.ratio = 1) +
scale_color_manual(values=c("#919494", "#7CAE00"))+
facet_grid(cols = vars(condition))
annotate_figure(ggarrange(p20, p22, labels="AUTO", nrow=2, ncol=1), text_grob("DM Tiles", face = "bold", size=16))
data_for_scatterplot <- data_for_scatterplot %>% separate(tiles, into = c("seqnames", "start", "end", "group"), sep = "\\.")
raw_validated <- data_for_scatterplot %>%
group_by(group) %>%
dplyr::count(meth.diff.x <=-25 | meth.diff.x >=25) %>%
dplyr::rename(threshold_25=2) %>%
pivot_wider(names_from=threshold_25, values_from=n) %>%
dplyr::rename(Validated=3, Not_Validated=2) %>%
mutate(percentage=(Validated/(Not_Validated+Validated)*100))
print(raw_validated)
norm_validated <- overlap_DM_data_normalised_mean_diff %>%
group_by(condition) %>%
dplyr::count(meth.diff.targeted <=-25 | meth.diff.targeted >=25) %>%
dplyr::rename(threshold_25=2) %>%
pivot_wider(names_from=threshold_25, values_from=n) %>%
dplyr::rename(Validated=3, Not_Validated=2) %>%
mutate(percentage=(Validated/(Not_Validated+Validated)*100))
print(norm_validated)
p23 <- overlap_DM_data_normalised_mean_diff %>%
ggplot(aes(x = condition, y = meth.diff.targeted, fill = condition)) +
geom_boxplot() +
ylab("Mean Methylation Difference [%]") +
xlab("ACS Type") +
labs(title = "Targeted Sequencing") +
scale_fill_discrete(name = "ACS Type") +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
ylim(-100, 80)
p24 <- overlap_DM_data_normalised_mean_diff %>%
ggplot(aes(x = condition, y = meth.diff.WGBS, fill = condition)) +
geom_boxplot() +
ylab("Mean Methylation Difference [%]") +
xlab("ACS Type") +
labs(title = "WGBS") +
scale_fill_discrete(name = "ACS Type") +
ylim(-100, 80) +
theme(plot.title = element_text(hjust = 0.5, face="bold"))
p25 <- overlap_DM_data_normalised_mean_diff %>%
ggplot(aes(x = condition, y = meth.diff.y, fill = condition)) +
geom_boxplot() +
ylab("Weighted Mean Methylation Difference [%]") +
xlab("ACS Type") +
labs(title = "Coverage Weighted \nWGBS") +
scale_fill_discrete(name = "ACS Type") +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
ylim(-100, 80)
annotate_figure(ggarrange(p23, p24, p25, nrow=1, ncol=3, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Quantile Normalised Data \nDM Tiles", face = "bold", size=16))
p26 <- overlap_normalised_mean_diff %>% ggplot(aes(x = condition, y = meth.diff.targeted, fill = condition)) +
geom_boxplot() +
ylab("Mean Methylation Difference [%]") +
xlab("ACS Type") +
labs(title = "Targeted Sequencing") +
scale_fill_discrete(name = "ACS Type") +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
ylim(-100, 100)
p27 <- overlap_normalised_mean_diff %>% ggplot(aes(x = condition, y = meth.diff.WGBS, fill = condition)) +
geom_boxplot() +
ylab("Mean Methylation Difference [%]") +
xlab("ACS Type") +
labs(title = "WGBS") +
scale_fill_discrete(name = "ACS Type") +
ylim(-100, 100) +
theme(plot.title = element_text(hjust = 0.5, face="bold") )
annotate_figure(ggarrange(p26, p27, nrow=1, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Quantile Normalised Data \nAll Common Tiles", face = "bold", size=16))
overlap_normalised_pca <- overlap_normalised %>%
group_by(condition)
overlap_normalised_pca <- group_split(overlap_normalised_pca)
names(overlap_normalised_pca) <- c("Control", "NSTEMI", "STEMI", "UA")
overlap_normalised_pca <- overlap_normalised_pca %>%
map(~ dplyr::select(., tiles, perc_meth, sample_ids, seq_method) %>%
pivot_wider(names_from = tiles, values_from = perc_meth))%>%
map2(names(.), ~ autoplot(prcomp(dplyr::select(.x, starts_with("chr"))), data = .x, colour = "seq_method") +
labs(title = .y, colour="Sequencing method") +
theme(plot.title = element_text(hjust = 0.5, face="bold"), , aspect.ratio = 1))
annotate_figure(ggarrange(plotlist = overlap_normalised_pca, nrow=2, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Quantile Normalised Data \nAll Common Tiles", face = "bold", size=16))
–> the seq bias is still visible even though quantile normalisation
log_fun <- function(observed) {
result <- (log((observed + 1) / ((100 - observed) + 1)))
}
overlap_log <- overlap_tiles_validation_WGBS %>%
dplyr::select_if(is.numeric) %>%
mutate_all(list(log = ~ log_fun(.))) %>%
inner_join(overlap_tiles_validation_WGBS) %>%
dplyr::select(-c(1:40)) %>%
relocate("tiles") %>%
pivot_longer(-tiles, names_to = "sample_ids", values_to = "log_transf_perc_meth") %>%
mutate(across("sample_ids", str_replace, "_log", "")) %>%
left_join(metadata %>% dplyr::select(sample_ids, condition, seq_method), by = "sample_ids")
Joining, by = c("2017", "2018", "2019", "2026", "2027", "2033", "3052", "3131", "3158", "SK", "VF", "N1", "N2", "N3", "N4", "N5", "N6", "H26", "H28", "AC1", "AC2", "AC3", "AC4", "AC5", "AC6",
"AC14", "AC15", "AC7", "AC8", "AC9", "AC10", "AC11", "AC12", "AC13", "AP1", "AP2", "AP3", "AP4", "AP5", "AP6")
overlap_grouped_log <- overlap_log %>%
group_by(condition) %>%
group_split()
names(overlap_grouped_log) <- c("Control", "NSTEMI", "STEMI", "UA")
overlap_grouped_log_pca <- overlap_grouped_log %>%
lapply(FUN = function(x) {
x %>%
dplyr::select(tiles, log_transf_perc_meth, sample_ids) %>%
pivot_wider(names_from = tiles, values_from = log_transf_perc_meth) %>%
left_join(metadata %>% dplyr::select(sample_ids, seq_method, condition), by = "sample_ids")
})
# Plot
overlap_grouped_log_pca_plot <- overlap_grouped_log_pca %>%
map2(
names(.),
~ autoplot(prcomp(dplyr::select(.x, starts_with("chr"))), data = .x, colour = "seq_method") +
labs(title = .y, colour="Sequencing \nmethod") +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1)
)
annotate_figure(ggarrange(plotlist =overlap_grouped_log_pca_plot , nrow=2, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Raw Data Log Space \nAll Common Tiles", face = "bold", size=16))
overlap_DM_log <- overlap_log %>%
tidyr::unite(tiles, tiles, condition, sep = ".") %>%
filter(tiles %in% c(diff_meth_tiles)) %>%
tidyr::separate(tiles, into = c("chrom", "start", "end", "condition"), sep = "\\.") %>%
tidyr::unite(tiles, chrom, start, end, sep = ".")
overlap_DM_grouped_log_pca <- overlap_DM_log %>%
group_by(condition) %>%
group_split()
names(overlap_DM_grouped_log_pca) <- c("NSTEMI", "STEMI", "UA")
overlap_DM_grouped_log_pca <- overlap_DM_grouped_log_pca %>%
lapply(FUN = function(x) {
x %>%
dplyr::select(tiles, log_transf_perc_meth, sample_ids) %>%
pivot_wider(names_from = tiles, values_from = log_transf_perc_meth) %>%
left_join(metadata %>% dplyr::select(sample_ids, seq_method, condition), by = "sample_ids")
})
# Plot
overlap_DM_grouped_log_pca_plot <- overlap_DM_grouped_log_pca %>%
map2(
names(.),
~ autoplot(prcomp(dplyr::select(.x, starts_with("chr"))), data = .x, colour = "seq_method") +
labs(title = .y, colour="Sequencing \nmethod") +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1)
)
annotate_figure(ggarrange(plotlist =overlap_DM_grouped_log_pca_plot , nrow=2, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Raw Data Log Space \nDM Tiles", face = "bold", size=16))
# Perform quantile normalisation
# Seperate targeted and WGBS data into two tibbles
overlap_grouped_log_targeted <- overlap_grouped_log %>%
map(~ dplyr::filter(., seq_method=='targeted')%>%
pivot_wider(names_from = sample_ids, values_from = log_transf_perc_meth) )
overlap_grouped_log_WGBS <- overlap_grouped_log %>%
map(~ dplyr::filter(., seq_method=='WGBS')%>%
pivot_wider(names_from = sample_ids, values_from = log_transf_perc_meth) )
overlap_grouped_log_WGBS_distribution <- overlap_grouped_log_WGBS %>%
map(~ c(t(dplyr::select(., -c("tiles", "condition", "seq_method")))))
#normalise and set column and rownames
overlap_grouped_log_targeted_normalised <- overlap_grouped_log_targeted %>%
map2(overlap_grouped_log_WGBS_distribution, ~normalize.quantiles.use.target(as.matrix(dplyr::select(.x, -c("tiles", "condition", "seq_method"))), .y, copy=TRUE) %>%
as_tibble() %>%
set_names(colnames(dplyr::select(.x, -c("tiles", "condition", "seq_method")))) %>%
add_column(tiles=.x[["tiles"]], .before=0))
#Join targeted and WGBS dataframes
overlap_grouped_log_normalised <- overlap_grouped_log_targeted_normalised %>%
map2(overlap_grouped_log_WGBS, ~.x %>%
full_join(.y[,-c(2:3)], by="tiles") %>%
column_to_rownames("tiles") %>%
as.matrix() )
batch <- metadata %>%
mutate(seq_method_binary = case_when(seq_method=="WGBS" ~ 0, seq_method=="targeted" ~ 1)) %>%
dplyr::select(sample_ids, seq_method_binary) %>% deframe()
#Run combat function on values
overlap_grouped_log_normalised_combat<- overlap_grouped_log_normalised %>% map(~ sva::ComBat(dat=., batch=batch[colnames(.)])%>%
as_tibble(rownames = "tiles") %>%
pivot_longer(-c(tiles), names_to="sample_ids", values_to="log_transf_perc_meth" ) %>%
left_join(metadata, by="sample_ids") %>%
mutate(treatment=as.factor(treatment)))
# Plot PCAs per condition
overlap_grouped_log_normalised_combat_pca <- overlap_grouped_log_normalised_combat %>%
map2(names(.), ~ dplyr::select(.x, tiles, log_transf_perc_meth, sample_ids, seq_method) %>%
pivot_wider(names_from = tiles, values_from = log_transf_perc_meth) %>%
add_column(condition=.y) )
overlap_grouped_log_normalised_combat_plot_per_disease<- overlap_grouped_log_normalised_combat_pca %>%
map2(names(.), ~ autoplot(prcomp(dplyr::select(.x, starts_with("chr"))), data = .x, colour = "seq_method") +
labs(title = .y, colour="Sequencing \nmethod") +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1))
#plot_grid(plotlist = overlap_grouped_log_normalised_combat_plot_per_disease, nrow = 2, ncol = 2, labels = "AUTO")
annotate_figure(ggarrange(plotlist =overlap_grouped_log_normalised_combat_plot_per_disease , nrow=2, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Log Transformed, Quantile Normalised and ComBat\nDM Tiles", face = "bold", size=16))
# Perform PCA per condition on DM tiles
overlap_DM_grouped_log_normalised_combat <- overlap_grouped_log_normalised_combat %>% map(~ tidyr::unite(., tiles, tiles, condition, sep = ".") %>%
dplyr::filter(tiles %in% c(diff_meth_tiles)) %>%
tidyr::separate(tiles, into = c("chrom", "start", "end", "condition"), sep = "\\.") %>%
tidyr::unite(tiles, chrom, start, end, sep = "."))
overlap_DM_grouped_log_normalised_combat <- overlap_DM_grouped_log_normalised_combat[2:4]
overlap_DM_grouped_log_normalised_combat_pca <- overlap_DM_grouped_log_normalised_combat %>%
map(~ dplyr::select(., tiles, log_transf_perc_meth, sample_ids, seq_method, condition) %>%
pivot_wider(names_from = tiles, values_from = log_transf_perc_meth))
overlap_DM_grouped_log_normalised_combat_plot_seperate_diseases <- overlap_DM_grouped_log_normalised_combat_pca %>%
map2(names(.), ~ autoplot(prcomp(dplyr::select(.x, starts_with("chr"))), data = .x, colour = "seq_method") +
labs(title = .y, colour="Sequencing \nmethod") +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1))
annotate_figure(ggarrange(plotlist = overlap_DM_grouped_log_normalised_combat_plot_seperate_diseases , nrow=2, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Log Transformed, Quantile Normalised and ComBat\nDM Tiles", face = "bold", size=16))
calc_log_fold_change <- function(log_dataframe) {
log_dataframe %>%
group_by(tiles, seq_method, condition) %>%
summarise_at(vars(log_transf_perc_meth), list(name = mean)) %>%
dplyr::rename(mean = "name") %>%
pivot_wider(values_from = mean, names_from = condition) %>%
mutate(
meth.diff_STEMI = STEMI - Control,
meth.diff_NSTEMI = NSTEMI - Control,
meth.diff_UA = UA - Control
) %>%
dplyr::select(-c(Control, NSTEMI, STEMI, UA)) %>%
dplyr::rename(STEMI = "meth.diff_STEMI", NSTEMI = "meth.diff_NSTEMI", UA = "meth.diff_UA") %>%
pivot_longer(-c(tiles, seq_method),
names_to = "condition",
values_to = "log_fold_change"
) %>%
pivot_wider(names_from = "seq_method", values_from = "log_fold_change") %>%
dplyr::rename(log_fold_change_WGBS = "WGBS", log_fold_change_targeted = "targeted")
}
overlap_log_normalised_combat <- overlap_grouped_log_normalised_combat %>%
bind_rows() %>%
dplyr::select(-c(treatment_descr, treatment))
overlap_grouped_log_normalised_combat_log_fold_change <- overlap_grouped_log_normalised_combat %>%
bind_rows() %>%
dplyr::select(-c(treatment_descr, treatment)) %>%
calc_log_fold_change()
data_for_scatterplot <- data_for_scatterplot %>%
tidyr::unite(tiles, seqnames, start, end, group, sep = ".")
overlap_DM_grouped_log_normalised_combat_log_fold_change <- overlap_grouped_log_normalised_combat_log_fold_change %>%
tidyr::unite(tiles, tiles, condition, sep = ".") %>%
dplyr::filter(tiles %in% c(data_for_scatterplot$tiles)) %>%
tidyr::separate(tiles, into = c("chrom", "start", "end", "condition"), sep = "\\.")
data_for_scatterplot <- data_for_scatterplot %>%
tidyr::separate(tiles, into = c("seqnames", "start", "end", "group"), sep = "\\.")
# Plot
p39 <- overlap_DM_grouped_log_normalised_combat_log_fold_change %>% ggplot(aes(x = log_fold_change_targeted, y = log_fold_change_WGBS)) +
geom_point(colour = "#7CAE00", shape = 16, size=2, alpha=0.6) +
ylab("log fold change WGBS") +
xlab("log fold change Targeted Sequencing") +
ylim(-5, 5) +
labs(title = "Quantile Normalisation and ComBat") +
stat_cor(method="pearson")+
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1) +
facet_grid(cols = vars(condition))
annotate_figure(ggarrange(p39 , nrow=1, ncol=1), text_grob("DM Tiles", face = "bold", size=16))
e_fun <- function(observed) {
result <- ((exp(observed) * 100) / (1 + exp(observed)))
}
overlap_log_normalised_combat <- overlap_log_normalised_combat %>%
dplyr::select(-c("seq_method", "condition")) %>%
pivot_wider(names_from = sample_ids, values_from = log_transf_perc_meth)
overlap_log_normalised_combat_no_log <- overlap_log_normalised_combat %>%
dplyr::select_if(is.numeric) %>%
mutate_all(list(perc_meth = ~ e_fun(.))) %>%
inner_join(overlap_log_normalised_combat) %>%
dplyr::select(-c(1:40)) %>%
relocate("tiles") %>%
pivot_longer(-tiles, names_to = "sample_ids", values_to = "perc_meth") %>%
mutate(across("sample_ids", str_replace, "_perc_meth", "")) %>%
left_join(metadata %>% dplyr::select(sample_ids, condition, seq_method), by = "sample_ids")
Joining, by = c("SK", "VF", "N1", "N2", "N3", "N4", "N5", "N6", "H26", "H28", "2019", "2033", "3131", "AC7", "AC8", "AC9", "AC10", "AC11", "AC12", "AC13", "2017", "2018", "3052", "3158",
"AC1", "AC2", "AC3", "AC4", "AC5", "AC6", "AC14", "AC15", "2026", "2027", "AP1", "AP2", "AP3", "AP4", "AP5", "AP6")
p40 <- overlap_log_normalised_combat_no_log %>% ggplot(aes(x = condition, y = perc_meth, fill = seq_method)) +
geom_boxplot() +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5, face="bold"))
p41 <- overlap_log_normalised_combat_no_log %>% ggplot(aes(x = condition, y = perc_meth, fill = seq_method)) +
geom_violin() +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5, face="bold"))
p42 <- overlap_log_normalised_combat_no_log %>% ggplot(aes(x = condition, y = perc_meth)) +
geom_violin(aes(fill = seq_method)) +
geom_boxplot(aes(fill = seq_method), width = 0.05) +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5, face="bold")) +
facet_grid(seq_method ~ .)
p43 <- overlap_log_normalised_combat_no_log %>% ggplot(aes(x = condition, y = perc_meth)) +
geom_violin(fill = "#7C7CFF") +
ylab("Methylation [%]") +
xlab("ACS Type") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5, face="bold"))
annotate_figure(ggarrange(p40, p41, p42, p43 , nrow=2, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Log-Backtransformed, Quantile Normalised and ComBat\nAll Common Tiles", face = "bold", size=16))
p44 <- overlap_log_normalised_combat_no_log %>%
filter(seq_method == "WGBS") %>%
ggplot(aes(x = sample_ids, y = perc_meth)) +
geom_violin(fill = "#F8766D") +
geom_boxplot(width = 0.05) +
ylab("Methylation [%]") +
xlab("Sample") +
labs(title = "WGBS") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5, face="bold"))
p45 <- overlap_log_normalised_combat_no_log %>%
filter(seq_method == "targeted") %>%
ggplot(aes(x = sample_ids, y = perc_meth)) +
geom_violin(fill = "#00BFC4") +
geom_boxplot(width = 0.05) +
ylab("Methylation [%]") +
xlab("Sample") +
labs(title = "Targeted Sequencing") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5, face="bold"))
p46 <- overlap_log_normalised_combat_no_log %>% ggplot(aes(x = sample_ids, y = perc_meth, fill = seq_method)) +
geom_violin() +
geom_boxplot(width = 0.05) +
ylab("Methylation [%]") +
xlab("Sample") +
labs(title = "Log-Backtransformed, Quantile Normalised and ComBat\nAll Common Tiles") +
scale_fill_discrete(name = "Method") +
theme(plot.title = element_text(hjust = 0.5, face="bold"), axis.text.x = element_text(angle = 45, vjust = 0.8, hjust = 1))
annotate_figure(ggarrange(p44, p45, nrow=2, ncol=1, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Log-Backtransformed, Quantile Normalised and ComBat\nAll Common Tiles", face = "bold", size=16))
p46
overlap_log_normalised_combat_no_log_plot <- overlap_log_normalised_combat_no_log %>%
pivot_wider(names_from = tiles, values_from = perc_meth)
p56 <- overlap_log_normalised_combat_no_log_plot %>%
autoplot(prcomp(dplyr::select(., starts_with("chr"))), data = ., colour = "condition") +
labs(colour="Condition") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 1)
p57 <- overlap_log_normalised_combat_no_log_plot %>%
autoplot(prcomp(dplyr::select(., starts_with("chr"))), data = ., colour = "seq_method") +
labs(colour="Sequencing \nmethod") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 1)
p58 <- overlap_log_normalised_combat_no_log_plot %>%
autoplot(prcomp(dplyr::select(., starts_with("chr"))), data = ., shape = "seq_method", colour="condition") +
labs(title = "Backtransformed, Quantile Normalised and ComBat\nAll Common Tiles", shape="Sequencing \nmethod", colour="Condition") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 1)
annotate_figure(ggarrange(p56, p57 , nrow=1, ncol=2, labels="AUTO"), text_grob("Backtransformed, Quantile Normalised and ComBat\nAll Common Tiles", face = "bold", size=16))
p58
# Perform PCA per condition on DM tiles
overlap_grouped_log_normalised_combat_log_fold_change_no_log <- overlap_log_normalised_combat_no_log %>%
group_by(condition) %>%
group_split()
overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log <- overlap_grouped_log_normalised_combat_log_fold_change_no_log %>%
map(~ tidyr::unite(., tiles, tiles, condition, sep = ".") %>%
dplyr::filter(tiles %in% c(diff_meth_tiles)) %>%
tidyr::separate(tiles, into = c("chrom", "start", "end", "condition"), sep = "\\.") %>%
tidyr::unite(tiles, chrom, start, end, sep = "."))
overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log <- overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log[2:4]
overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log_pca <- overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log %>%
map(~ dplyr::select(., tiles, perc_meth, sample_ids, seq_method, condition) %>%
pivot_wider(names_from = tiles, values_from = perc_meth))
names(overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log_pca) <- c("NSTEMI", "STEMI", "UA")
overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log_plot_seperate_diseases <- overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log_pca %>%
map2(names(.), ~ autoplot(prcomp(dplyr::select(.x, starts_with("chr"))), data = .x, colour = "seq_method") +
labs(title = .y, colour="Sequencing \nmethod") +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1))
annotate_figure(ggarrange(plotlist = overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log_plot_seperate_diseases , nrow=2, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Backtransformed, Quantile Normalised and ComBat\nDM Tiles", face = "bold", size=16))
DM_tiles_NSTEMI <- overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log_pca[["NSTEMI"]] %>% dplyr::select(starts_with("chr")) %>% colnames()
DM_tiles_STEMI <- overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log_pca[["STEMI"]] %>% dplyr::select(starts_with("chr")) %>% colnames()
DM_tiles_UA <- overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log_pca[["UA"]] %>% dplyr::select(starts_with("chr")) %>% colnames()
DM_tiles_NSTEMI_unique <- setdiff(DM_tiles_NSTEMI, DM_tiles_STEMI)
DM_tiles_NSTEMI_unique <- setdiff(DM_tiles_NSTEMI_unique, DM_tiles_UA)
DM_tiles_STEMI_unique <- setdiff(DM_tiles_STEMI, DM_tiles_NSTEMI)
DM_tiles_STEMI_unique <- setdiff(DM_tiles_STEMI_unique, DM_tiles_UA)
DM_tiles_UA_unique <- setdiff(DM_tiles_UA, DM_tiles_NSTEMI)
DM_tiles_UA_unique <- setdiff(DM_tiles_UA_unique, DM_tiles_STEMI)
length(DM_tiles_NSTEMI_unique) #187
[1] 187
length(DM_tiles_STEMI_unique) #431
[1] 431
length(DM_tiles_UA_unique) #473
[1] 473
overlap_grouped_unique_DM_log_normalised_combat_log_fold_change_no_log_pca <- list()
overlap_grouped_unique_DM_log_normalised_combat_log_fold_change_no_log_pca[["NSTEMI"]] <- overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log_pca[["NSTEMI"]] %>%
dplyr::select(sample_ids, seq_method, condition, DM_tiles_NSTEMI_unique)
Note: Using an external vector in selections is ambiguous.
i Use `all_of(DM_tiles_NSTEMI_unique)` instead of `DM_tiles_NSTEMI_unique` to silence this message.
i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
overlap_grouped_unique_DM_log_normalised_combat_log_fold_change_no_log_pca[["STEMI"]] <- overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log_pca[["STEMI"]] %>%
dplyr::select(sample_ids, seq_method, condition, DM_tiles_STEMI_unique)
Note: Using an external vector in selections is ambiguous.
i Use `all_of(DM_tiles_STEMI_unique)` instead of `DM_tiles_STEMI_unique` to silence this message.
i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
overlap_grouped_unique_DM_log_normalised_combat_log_fold_change_no_log_pca[["UA"]] <- overlap_grouped_DM_log_normalised_combat_log_fold_change_no_log_pca[["UA"]] %>%
dplyr::select(sample_ids, seq_method, condition, DM_tiles_UA_unique)
Note: Using an external vector in selections is ambiguous.
i Use `all_of(DM_tiles_UA_unique)` instead of `DM_tiles_UA_unique` to silence this message.
i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
overlap_grouped_unique_DM_log_normalised_combat_log_fold_change_no_log_plot_seperate_diseases <- overlap_grouped_unique_DM_log_normalised_combat_log_fold_change_no_log_pca %>%
map2(names(.), ~ autoplot(prcomp(dplyr::select(.x, starts_with("chr"))), data = .x, colour = "seq_method") +
labs(title = .y, colour="Sequencing \nmethod") +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1))
annotate_figure(ggarrange(plotlist = overlap_grouped_unique_DM_log_normalised_combat_log_fold_change_no_log_plot_seperate_diseases , nrow=2, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Backtransformed, Quantile Normalised and ComBat\nUnique DM Tiles", face = "bold", size=16))
unique_DM_tiles <- c(unlist(DM_tiles_NSTEMI_unique), unlist(DM_tiles_STEMI_unique), unlist(DM_tiles_UA_unique))
#Filter methylation values for all per disease uniquely differentially methylated tiles
unique_DM_perc_meth <- overlap_log_normalised_combat_no_log %>% dplyr::filter(tiles %in% unique_DM_tiles)
unique_DM_perc_meth_pca <- unique_DM_perc_meth %>%
tidyr::pivot_wider(values_from=perc_meth, names_from=tiles)
p63 <- unique_DM_perc_meth_pca %>%
autoplot(prcomp(dplyr::select(., starts_with("chr"))), data = ., colour = "condition") +
labs(colour="Condition") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 1)
p64 <- unique_DM_perc_meth_pca %>%
autoplot(prcomp(dplyr::select(., starts_with("chr"))), data = ., colour = "seq_method") +
labs(colour="Sequencing \nmethod") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 1)
p65 <- unique_DM_perc_meth_pca %>%
autoplot(prcomp(dplyr::select(., starts_with("chr"))), data = ., shape = "seq_method", colour="condition") +
labs(title = "Backtransformed, Quantile Normalised and ComBat\nAll unique DM Tiles", shape="Sequencing \nmethod", colour="Condition") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 1)
annotate_figure(ggarrange(p63, p64 , nrow=1, ncol=2, labels="AUTO"), text_grob("Backtransformed, Quantile Normalised and ComBat\nAll unique DM Tiles", face = "bold", size=16))
p65
calc_diff_of_means <- function(dataframe_all_samples, tiles.condition) {
dataframe_all_samples_mean <- dataframe_all_samples %>%
group_by(tiles, seq_method, condition) %>%
summarise_at(vars(perc_meth), list(name = mean)) %>%
dplyr::rename(mean = "name")
dataframe_all_samples_mean_diff <- dataframe_all_samples_mean %>%
pivot_wider(values_from = mean, names_from = condition) %>%
mutate(
meth.diff_STEMI = STEMI - Control,
meth.diff_NSTEMI = NSTEMI - Control,
meth.diff_UA = UA - Control
) %>%
dplyr::select(-c(Control, NSTEMI, STEMI, UA)) %>%
dplyr::rename(STEMI = "meth.diff_STEMI", NSTEMI = "meth.diff_NSTEMI", UA = "meth.diff_UA") %>%
pivot_longer(-c(tiles, seq_method),
names_to = "condition",
values_to = "meth_diff"
) %>%
pivot_wider(names_from = "seq_method", values_from = "meth_diff") %>%
dplyr::rename(meth.diff.WGBS = "WGBS", meth.diff.targeted = "targeted")
dataframe_all_samples_mean_diff_tiles <- dataframe_all_samples_mean_diff %>%
tidyr::unite(tiles, tiles, condition, sep = ".") %>%
filter(tiles %in% c(tiles.condition)) %>%
tidyr::separate(tiles, into = c("chrom", "start", "end", "condition"), sep = "\\.")
}
overlap_DM_log_normalised_combat_log_fold_change_no_log_mean_diff <- calc_diff_of_means(overlap_log_normalised_combat_no_log, diff_meth_tiles)
#Calculate threshold for scatterplot
overlap_DM_log_normalised_combat_log_fold_change_no_log_mean_diff <- overlap_DM_log_normalised_combat_log_fold_change_no_log_mean_diff %>%
mutate(threshold_25= case_when((((25 <= meth.diff.WGBS) & (25 <= meth.diff.targeted)) | (((meth.diff.WGBS <=-25) & (meth.diff.targeted <=-25))) ) ~ "|25%|", TRUE ~ "not in range"))
p47 <- overlap_DM_log_normalised_combat_log_fold_change_no_log_mean_diff %>%
ggplot(aes(x = meth.diff.targeted, y = meth.diff.WGBS, colour=threshold_25)) +
geom_point(shape = 16, size=2, alpha=0.6) +
geom_vline(xintercept = c(-25, 25), linetype="dotted")+
geom_hline(yintercept = c(-25, 25), linetype="dotted")+
ylab("Mean Methylation Difference WGBS") +
xlab("Mean Methylation Difference Targeted Sequencing") +
labs(title = "Log-Backtransformed, Quantile Normalised and ComBat", colour="Threshold") +
stat_cor(method="pearson")+
ylim(-70, 70) +
xlim(-70, 70) +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1) +
scale_color_manual(values=c("#919494", "#7CAE00"))+
facet_grid(cols = vars(condition))
annotate_figure(ggarrange(p47 , nrow=1, ncol=1), text_grob("DM Tiles", face = "bold", size=16))
# Plot distribution of mean DM per condition boxplots
p48 <- overlap_DM_log_normalised_combat_log_fold_change_no_log_mean_diff %>% ggplot(aes(x = condition, y = meth.diff.targeted, fill = condition)) +
geom_boxplot() +
ylab("Mean Methylation Difference [%]") +
xlab("ACS Type") +
labs(title = "Targeted Sequencing") +
scale_fill_discrete(name = "ACS Type") +
theme(plot.title = element_text(hjust = 0.5)) +
ylim(-100, 80)
p49 <- overlap_DM_log_normalised_combat_log_fold_change_no_log_mean_diff %>% ggplot(aes(x = condition, y = meth.diff.WGBS, fill = condition)) +
geom_boxplot() +
ylab("Mean Methylation Difference [%]") +
xlab("ACS Type") +
labs(title = "WGBS") +
scale_fill_discrete(name = "ACS Type") +
ylim(-100, 80) +
theme(plot.title = element_text(hjust = 0.5))
annotate_figure(ggarrange(p48, p49 , nrow=1, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Log-Backtransformed, Quantile Normalised and ComBat\nDM Tiles", face = "bold", size=16))
#Add qvalue to dataframe and calculate if threshold for qvalue below 0.01
lim=65
overlap_DM_log_normalised_combat_log_fold_change_no_log_mean_diff_qvalues <- overlap_DM_log_normalised_combat_log_fold_change_no_log_mean_diff %>%
tidyr::unite(tiles, chrom, start, end, condition, sep=".") %>%
left_join(dplyr::select(bind_rows(methylDiff_validation), tiles, qvalue), by="tiles") %>%
dplyr::rename(qvalue_targeted= qvalue) %>%
left_join(dplyr::select(bind_rows(methylDiff_WGBS), tiles, qvalue), by="tiles") %>%
dplyr::rename(qvalue_WGBS=qvalue) %>%
tidyr::separate(tiles, into=c("chrom", "start", "end", "condition")) %>%
dplyr::mutate(threshold_25_qvalue= case_when(threshold_25 =="|25%|" & qvalue_targeted <=0.01 & qvalue_WGBS <=0.01 ~ "significant", TRUE ~ "insignificant"))
p59 <- overlap_DM_log_normalised_combat_log_fold_change_no_log_mean_diff_qvalues %>%
ggplot(aes(x = meth.diff.targeted, y = meth.diff.WGBS, colour=threshold_25_qvalue)) +
geom_point(shape = 16, alpha=0.6, size=2) +
geom_vline(xintercept = c(-25, 25), linetype="dotted")+
geom_hline(yintercept = c(-25, 25), linetype="dotted")+
ylab("Mean Methylation Difference WGBS") +
xlab("Mean Methylation Difference Targeted Sequencing") +
labs(title = "Log-Backtransformed, Quantile Normalised and ComBat", colour="Threshold") +
stat_cor(method="pearson")+
ylim(-lim, lim) +
xlim(-lim, lim) +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1) +
scale_color_manual(values=c("#919494", "#7CAE00"))+
facet_grid(cols = vars(condition))
annotate_figure(ggarrange(p59 , nrow=1, ncol=1), text_grob("DM Tiles", face = "bold", size=16))
#Get all tiles of differential methylation regardless of percentage
all_DM_tiles<- overlap_DM_log_normalised_combat_log_fold_change_no_log_mean_diff_qvalues %>%
#filter(threshold_25 == "|25%|") %>%
tidyr::unite(tiles, chrom, start, end, sep=".") %>%
pull(tiles) %>%
unique()
#Filter methylation values for all differentially methylated tiles
all_DM_perc_meth <- overlap_log_normalised_combat_no_log %>% dplyr::filter(tiles %in% all_DM_tiles)
all_DM_perc_meth_pca <- all_DM_perc_meth %>%
tidyr::pivot_wider(values_from=perc_meth, names_from=tiles)
p60 <- all_DM_perc_meth_pca %>%
autoplot(prcomp(dplyr::select(., starts_with("chr"))), data = ., colour = "condition") +
labs(colour="Condition") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 1)
p61 <- all_DM_perc_meth_pca %>%
autoplot(prcomp(dplyr::select(., starts_with("chr"))), data = ., colour = "seq_method") +
labs(colour="Sequencing \nmethod") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 1)
p62 <- all_DM_perc_meth_pca %>%
autoplot(prcomp(dplyr::select(., starts_with("chr"))), data = ., shape = "seq_method", colour="condition") +
labs(title = "Backtransformed, Quantile Normalised and ComBat\nAll DM Tiles", shape="Sequencing \nmethod", colour="Condition") +
theme(plot.title = element_text(hjust = 0.5, face = "bold"), aspect.ratio = 1)
annotate_figure(ggarrange(p60, p61 , nrow=1, ncol=2, labels="AUTO"), text_grob("Backtransformed, Quantile Normalised and ComBat\nAll DM Tiles", face = "bold", size=16))
p62
#Alternative method
# test<- reduce(group_split(all_DM_perc_meth, condition), inner_join, by=tiles)
p66
Warning: Removed 3 rows containing non-finite values (stat_cor).
Warning: Removed 3 rows containing missing values (geom_point).
cohort_comparison_validation_numbers_q_value_directional <- compare_discovery_validation %>%
group_by(condition) %>%
dplyr::count(threshold_qvalue_directional =="significant") %>%
dplyr::rename(threshold_q_value_directional=2) %>%
pivot_wider(names_from=threshold_q_value_directional, values_from=n) %>%
dplyr::rename(Significant_DMR=3, Insignificant_DMR=2) %>%
mutate(Percentage_Significant_DMR=(Significant_DMR/(Insignificant_DMR+Significant_DMR)*100),
Total_Number_DMRs=sum(Significant_DMR, Insignificant_DMR)) %>%
dplyr::relocate(Total_Number_DMRs, .after=condition)
cohort_comparison_validation_numbers_q_value_25_cutoff <- compare_discovery_validation %>%
dplyr::filter(threshold_qvalue == "significant") %>%
group_by(condition) %>%
dplyr::count(threshold_25_cohort_comparison == "DMR validated" & threshold_qvalue == "significant") %>%
dplyr::rename(threshold_25=2) %>%
pivot_wider(names_from=threshold_25, values_from=n) %>%
dplyr::rename(Significant_DMR_Validated=3, Significant_DMR_Not_Validated=2) %>%
mutate(Percentage_Significant_DMR_Validated=(Significant_DMR_Validated/(Significant_DMR_Not_Validated+Significant_DMR_Validated)*100),
Total_Number_DMRs=sum(Significant_DMR_Not_Validated, Significant_DMR_Validated)) %>%
dplyr::relocate(Total_Number_DMRs, .after=condition)
cohort_comparison_validation_numbers_q_value_directional_25_cutoff <- compare_discovery_validation %>%
dplyr::filter(threshold_qvalue_directional == "significant") %>%
group_by(condition) %>%
dplyr::count(threshold_25_cohort_comparison == "DMR validated" & threshold_qvalue == "significant") %>%
dplyr::rename(threshold_25=2) %>%
pivot_wider(names_from=threshold_25, values_from=n) %>%
dplyr::rename(Significant_DMR_Validated=3, Significant_DMR_Not_Validated=2) %>%
mutate(Percentage_Significant_DMR_Validated=(Significant_DMR_Validated/(Significant_DMR_Not_Validated+Significant_DMR_Validated)*100),
Total_Number_DMRs=sum(Significant_DMR_Not_Validated, Significant_DMR_Validated)) %>%
dplyr::relocate(Total_Number_DMRs, .after=condition)
print(cohort_comparison_validation_numbers_q_value_directional)
print(cohort_comparison_validation_numbers_q_value_25_cutoff)
print(cohort_comparison_validation_numbers_q_value_directional_25_cutoff)
To find out if DMR validation might be associable with clinical marker expression
#First try to use quantile and combat normalised perc_meth values:
DMRs <- compare_discovery_validation %>%
dplyr::select(chrom, start, end, condition, threshold_25_cohort_comparison) %>%
tidyr::unite(DMR, chrom, start, end, sep=".") %>%
group_split(threshold_25_cohort_comparison) %>%
map(~pull(., DMR) %>% unique())
#Join with clinical Data
overlap_log_normalised_combat_no_log_plot_clinical_data<-overlap_log_normalised_combat_no_log_plot %>%
dplyr::select(sample_ids, starts_with("chr")) %>%
dplyr::inner_join(clinical_markers_ACS, by = c("sample_ids"="Sample"))
#Calculate Correlation
validation_corr_biomarkers <- overlap_log_normalised_combat_no_log_plot_clinical_data %>%
dplyr::select(any_of(c(DMRs[[1]], DMRs[[2]])), "LVEF(%)", "CRP(mg/dl_<5)","TroponinThs_t1 (ng/l <14or<50)", "CK (U/l _<190)" , "CKmax(U/l_<190)" ,"CK-MB(U/l <24)","CK-MB_Max") %>%
stats::cor(use = "complete.obs",method = "pearson") %>%
as_tibble(rownames="DMR") %>%
dplyr::mutate(across(where(is.numeric), ~.^2)) %>%
dplyr::select(DMR, "LVEF(%)", "CRP(mg/dl_<5)","TroponinThs_t1 (ng/l <14or<50)", "CK (U/l _<190)" , "CKmax(U/l_<190)" ,"CK-MB(U/l <24)","CK-MB_Max")
#Annotate with validation status
validation_corr_biomarkers <- validation_corr_biomarkers %>%
dplyr::mutate(DMRs_validation=if_else(DMR %in% DMRs[[2]], "validated", "not validated"))
validation_corr_biomarkers_long <- validation_corr_biomarkers %>%
dplyr::rename('CRP (mg/dl)'= "CRP(mg/dl_<5)",
'TroponinT (ng/l)'="TroponinThs_t1 (ng/l <14or<50)",
'CK (U/l)'="CK (U/l _<190)",
'CK max (U/l)'="CKmax(U/l_<190)",
'CK-MB (U/l)'="CK-MB(U/l <24)",
'CK-MB max (U/l)'="CK-MB_Max") %>%
tidyr::pivot_longer(-c(DMR, DMRs_validation), names_to = "biomarker", values_to="Rsquared")
#Plot
clinical_marker_correlation <- validation_corr_biomarkers_long %>%
ggplot(aes(x = DMRs_validation, y = Rsquared, fill=DMRs_validation)) +
geom_boxplot(outlier.size=0.3) +
ggpubr::stat_compare_means(size=2, label.x=1.2, label.y=1.1)+
labs(y=expression("R"^2), x="DMRs") +
ylim(0,1.25)+
scale_fill_manual(name= "DMRs", values=c("darkred", "darkgreen")) +
theme_bw()+
facet_wrap(vars(biomarker))+
theme(strip.text = element_text(size = 6), axis.text=element_text(size=6))
clinical_marker_correlation
# ggsave(bg ="white",
# "/home/arathge/output/plots/ACSS_cardiac_ccfDNA/clinical_marker_correlation.tiff",
# device = "tiff",
# plot = clinical_marker_correlation
# )
# Try also with unnormalised raw perc_meth values:
#Transpose tibble
overlap_tiles_validation_WGBS_t <- overlap_tiles_validation_WGBS %>%
pivot_longer(-tiles) %>%
pivot_wider(names_from = tiles, values_from = value) %>%
dplyr::rename(sample_ids="name")
#Join with Clinical Data
overlap_tiles_validation_WGBS_t_clinical_data<-overlap_tiles_validation_WGBS_t %>%
dplyr::select(sample_ids, starts_with("chr")) %>%
dplyr::inner_join(clinical_markers_ACS, by = c("sample_ids"="Sample"))
#Calculate Correlation
validation_corr_biomarkers_raw_data <- overlap_tiles_validation_WGBS_t_clinical_data %>%
dplyr::select(any_of(c(DMRs[[1]], DMRs[[2]])), "LVEF(%)", "CRP(mg/dl_<5)","TroponinThs_t1 (ng/l <14or<50)", "CK (U/l _<190)" , "CKmax(U/l_<190)" ,"CK-MB(U/l <24)","CK-MB_Max") %>%
stats::cor(use = "complete.obs",method = "pearson") %>%
as_tibble(rownames="DMR") %>%
dplyr::mutate(across(where(is.numeric), ~.^2)) %>%
dplyr::select(DMR, "LVEF(%)", "CRP(mg/dl_<5)","TroponinThs_t1 (ng/l <14or<50)", "CK (U/l _<190)" , "CKmax(U/l_<190)" ,"CK-MB(U/l <24)","CK-MB_Max")
#Annotate with validation status
validation_corr_biomarkers_raw_data <- validation_corr_biomarkers_raw_data %>%
dplyr::mutate(DMRs_validation=if_else(DMR %in% DMRs[[2]], "validated", "not validated"))
validation_corr_biomarkers_raw_data_long <- validation_corr_biomarkers_raw_data %>%
dplyr::rename('CRP (mg/dl)'= "CRP(mg/dl_<5)",
'TroponinT (ng/l)'="TroponinThs_t1 (ng/l <14or<50)",
'CK (U/l)'="CK (U/l _<190)",
'CK max (U/l)'="CKmax(U/l_<190)",
'CK-MB (U/l)'="CK-MB(U/l <24)",
'CK-MB max (U/l)'="CK-MB_Max") %>%
tidyr::pivot_longer(-c(DMR, DMRs_validation), names_to = "biomarker", values_to="Rsquared")
#Plot
clinical_marker_correlation_raw_data <- validation_corr_biomarkers_raw_data_long %>%
ggplot(aes(x = DMRs_validation, y = Rsquared, fill=DMRs_validation)) +
geom_boxplot(outlier.size=0.3) +
ggpubr::stat_compare_means(size=2, label.x=1.2, label.y=1.1)+
labs(y=expression("R"^2), x="DMRs") +
ylim(0,1.25)+
scale_fill_manual(name= "DMRs", values=c("darkred", "darkgreen")) +
theme_bw()+
facet_wrap(vars(biomarker))+
theme(strip.text = element_text(size = 6), axis.text=element_text(size=6))
clinical_marker_correlation_raw_data
# ggsave(bg ="white",
# "/home/arathge/output/plots/ACSS_cardiac_ccfDNA/clinical_marker_correlation_raw_data.tiff",
# device = "tiff",
# plot = clinical_marker_correlation_raw_data
# )
methCovPerBaseWGBS <- setNames(methylKit::getData(methylBaseDB_WGBS_all_samples)[, c(1, 2, 3, methylBaseDB_WGBS_all_samples@coverage.index)], c("chr", "start", "end", methylBaseDB_WGBS_all_samples@sample.ids))
Uncompressing file.
Reading file.
methCovPerBaseTargeted <- setNames(methylKit::getData(methylBaseDB_validation_no_CAD)[, c(1, 2, 3, methylBaseDB_validation_no_CAD@coverage.index)], c("chr", "start", "end", methylBaseDB_validation_no_CAD@sample.ids))
#detach("package:methylKit", unload = TRUE)
#detach("package:GenomicRanges", unload = TRUE)
methCovPerBaseWGBS <- methCovPerBaseWGBS %>%
tidyr::unite(tiles, chr, start, end, sep = ".") %>%
dplyr::filter(tiles %in% c(tiles))
methCovPerBaseTargeted <- methCovPerBaseTargeted %>%
tidyr::unite(tiles, chr, start, end, sep = ".") %>%
dplyr::filter(tiles %in% c(tiles))
methCovPerBase <- methCovPerBaseWGBS %>% inner_join(methCovPerBaseTargeted)
Joining, by = "tiles"
methCovPerBase <- methCovPerBase %>% pivot_longer(-tiles, names_to = "sample_ids", values_to = "coverage")
all_data_coverage <- all_data %>% left_join(methCovPerBase, by = c("tiles", "sample_ids"))
DM_data_coverage <- all_data_coverage %>%
tidyr::unite(tiles, tiles, condition, sep = ".") %>%
filter(tiles %in% diff_meth_tiles) %>%
tidyr::separate(tiles, into = c("chrom", "start", "end", "condition"))
overlap_log_normalised_combat_no_log_weighted <- overlap_log_normalised_combat_no_log %>% left_join(dplyr::select(all_data_coverage, tiles, sample_ids, coverage), by = c("tiles", "sample_ids"))
calc_weighted_diff_of_means <- function(dataframe_all_samples) {
dataframe_all_samples_mean <- dataframe_all_samples %>%
dplyr::group_by(tiles, seq_method, condition) %>%
dplyr::summarise(weighted_mean = weighted.mean(perc_meth, coverage))
dataframe_all_samples_mean_diff <- dataframe_all_samples_mean %>%
pivot_wider(values_from = weighted_mean, names_from = condition) %>%
mutate(
meth.diff_STEMI = STEMI - Control,
meth.diff_NSTEMI = NSTEMI - Control,
meth.diff_UA = UA - Control
) %>%
dplyr::select(-c(Control, NSTEMI, STEMI, UA)) %>%
dplyr::rename(STEMI = "meth.diff_STEMI", NSTEMI = "meth.diff_NSTEMI", UA = "meth.diff_UA") %>%
pivot_longer(-c(tiles, seq_method),
names_to = "condition",
values_to = "weighted_meth_diff"
) %>%
pivot_wider(names_from = "seq_method", values_from = "weighted_meth_diff") %>%
dplyr::rename(weighted.meth.diff.WGBS = "WGBS", weighted.meth.diff.targeted = "targeted")
}
overlap_log_normalised_combat_no_log_weighted <- calc_weighted_diff_of_means(overlap_log_normalised_combat_no_log_weighted)
`summarise()` has grouped output by 'tiles', 'seq_method'. You can override using the `.groups` argument.
overlap_DM_log_normalised_combat_log_fold_change_no_log_weighted <- overlap_log_normalised_combat_no_log_weighted %>%
tidyr::unite(tiles, tiles, condition, sep = ".") %>%
dplyr::filter(tiles %in% c(diff_meth_tiles)) %>%
tidyr::separate(tiles, into = c("chrom", "start", "end", "condition"), sep = "\\.") %>%
tidyr::unite(tiles, chrom, start, end, sep=".")
#Calculate threshold
overlap_log_normalised_combat_no_log_weighted <- overlap_log_normalised_combat_no_log_weighted %>%
mutate(threshold_25= case_when((((25 <= weighted.meth.diff.WGBS) & (25 <= weighted.meth.diff.targeted)) | (((weighted.meth.diff.targeted <=-25) & (weighted.meth.diff.WGBS <=-25))) ) ~ "|25%|", TRUE ~ "not in range"))
overlap_DM_log_normalised_combat_log_fold_change_no_log_weighted <- overlap_DM_log_normalised_combat_log_fold_change_no_log_weighted %>%
mutate(threshold_25= case_when((((25 <= weighted.meth.diff.WGBS) & (25 <= weighted.meth.diff.targeted)) | (((weighted.meth.diff.targeted <=-25) & (weighted.meth.diff.WGBS <=-25))) ) ~ "|25%|", TRUE ~ "not in range"))
#Count how many tiles are above/below threshold for each condition
overlap_log_normalised_combat_no_log_weighted_validated <- overlap_log_normalised_combat_no_log_weighted %>%
dplyr::group_by(condition) %>%
dplyr::count(threshold_25 =="|25%|") %>%
dplyr::rename(threshold_25=2) %>%
pivot_wider(names_from=threshold_25, values_from=n) %>%
dplyr::rename(Validated=3, Not_Validated=2) %>%
mutate(percentage=(Validated/(Not_Validated+Validated)*100))
print(overlap_log_normalised_combat_no_log_weighted_validated)
overlap_DM_log_normalised_combat_log_fold_change_no_log_weighted_validated <- overlap_DM_log_normalised_combat_log_fold_change_no_log_weighted %>%
dplyr::group_by(condition) %>%
dplyr::count(threshold_25 =="|25%|") %>%
dplyr::rename(threshold_25=2) %>%
pivot_wider(names_from=threshold_25, values_from=n) %>%
dplyr::rename(Validated=3, Not_Validated=2) %>%
mutate(percentage=(Validated/(Not_Validated+Validated)*100))
print(overlap_DM_log_normalised_combat_log_fold_change_no_log_weighted_validated)
p50 <- overlap_log_normalised_combat_no_log_weighted %>%
ggplot(aes(x = weighted.meth.diff.targeted, y = weighted.meth.diff.WGBS, colour=threshold_25)) +
geom_point(shape = 16, alpha=0.6, size=2) +
geom_vline(xintercept = c(-25, 25), linetype="dotted")+
geom_hline(yintercept = c(-25, 25), linetype="dotted")+
ylab("Weighted Mean Methylation Difference WGBS") +
xlab("Weighted Mean Methylation Difference Targeted Sequencing") +
labs(title = "All Common Tiles", colour="Threshold") +
stat_cor(method="pearson")+
xlim(-90, 90) +
ylim(-90, 90) +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1) +
scale_color_manual(values=c("#919494", "#7CAE00"))+
facet_grid(cols = vars(condition))
p51 <- overlap_DM_log_normalised_combat_log_fold_change_no_log_weighted %>%
ggplot(aes(x = weighted.meth.diff.targeted, y = weighted.meth.diff.WGBS, colour=threshold_25)) +
geom_point(shape = 16, alpha=0.6, size=2) +
geom_vline(xintercept = c(-25, 25), linetype="dotted")+
geom_hline(yintercept = c(-25, 25), linetype="dotted")+
ylab("Weighted Mean Methylation Difference WGBS") +
xlab("Weighted Mean Methylation Difference Targeted Sequencing") +
labs(title = "DM Tiles", colour="Threshold") +
stat_cor(method="pearson")+
xlim(-90, 90) +
ylim(-90, 90) +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1) +
scale_color_manual(values=c("#919494", "#7CAE00"))+
facet_grid(cols = vars(condition))
annotate_figure(ggarrange(p50, p51 , nrow=2, ncol=1, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("Log-Backtransformed, Quantile Normalised and ComBat", face = "bold", size=16))
#Add qvalue to dataframe and calculate if threshold for qvalue below 0.01
overlap_DM_log_normalised_combat_log_fold_change_no_log_weighted_qvalues <- overlap_DM_log_normalised_combat_log_fold_change_no_log_weighted %>%
tidyr::unite(tiles, tiles, condition, sep=".") %>%
left_join(dplyr::select(bind_rows(methylDiff_validation), tiles, qvalue)) %>%
dplyr::rename(qvalue_targeted=qvalue) %>%
left_join(dplyr::select(bind_rows(methylDiff_WGBS), tiles, qvalue)) %>%
dplyr::rename(qvalue_WGBS=qvalue) %>%
tidyr::separate(tiles, into=c("chrom", "start", "end", "condition")) %>%
dplyr::mutate(threshold_25_qvalue= case_when(threshold_25 =="|25%|" & qvalue_targeted <=0.01 & qvalue_WGBS <0.01 ~ "significant", TRUE ~ "insignificant"))
Joining, by = "tiles"
Joining, by = "tiles"
p51 <- overlap_DM_log_normalised_combat_log_fold_change_no_log_weighted_qvalues %>%
ggplot(aes(x = weighted.meth.diff.targeted, y = weighted.meth.diff.WGBS, colour=threshold_25_qvalue)) +
geom_point(shape = 16, size=2, alpha=0.6) +
geom_vline(xintercept = c(-25, 25), linetype="dotted")+
geom_hline(yintercept = c(-25, 25), linetype="dotted")+
ylab("Weighted Mean Methylation Difference WGBS") +
xlab("Weighted Mean Methylation Difference Targeted Sequencing") +
labs(title = "DM Tiles", colour="Threshold") +
stat_cor(method="pearson")+
xlim(-90, 90) +
ylim(-90, 90) +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1) +
scale_color_manual(values=c("#919494", "#7CAE00"))+
facet_grid(cols = vars(condition))
p51
long_df_to_matrix <- function(df) {
df %>%
dplyr::select(1:3) %>%
pivot_wider(names_from = sample_ids, values_from = log_transf_perc_meth) %>%
column_to_rownames("tiles") %>%
as.matrix() %>%
return()
}
add_metadata_matrix <- function(mx, metadata_df) {
mx %>%
as_tibble(rownames = "tiles") %>%
pivot_longer(-tiles, names_to = "sample_ids", values_to = "log_transf_perc_meth") %>%
left_join(dplyr::select(metadata, sample_ids, seq_method, condition)) %>%
pivot_wider(names_from = tiles, values_from = log_transf_perc_meth) %>%
return()
}
overlap_grouped_log_combat <- overlap_grouped_log %>%
map(~ long_df_to_matrix(.) %>%
sva::ComBat(dat = ., batch = batch[colnames(.)]) %>%
add_metadata_matrix(., metadata_df = metadata))
Found 165 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
Found2batches
Adjusting for0covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data
Joining, by = "sample_ids"
Found 119 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
Found2batches
Adjusting for0covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data
Joining, by = "sample_ids"
Found 54 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
Found2batches
Adjusting for0covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data
Joining, by = "sample_ids"
Found 191 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
Found2batches
Adjusting for0covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data
Joining, by = "sample_ids"
overlap_grouped_log_combat_pca_plot <- overlap_grouped_log_combat %>%
map2(
names(.),
~ autoplot(prcomp(dplyr::select(.x, starts_with("chr"))), data = .x, colour = "seq_method") +
labs(title = .y, colour="Sequencing \nmethod") +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1)
)
annotate_figure(ggarrange(plotlist = overlap_grouped_log_combat_pca_plot , nrow=2, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("ComBat in Log Space\nAll Common Tiles", face = "bold", size=16))
overlap_log_combat <- overlap_log %>%
long_df_to_matrix() %>%
ComBat(dat = ., batch = batch[colnames(.)]) %>%
add_metadata_matrix(metadata_df = metadata)
overlap_log_combat <- overlap_log %>%
long_df_to_matrix() %>%
ComBat(dat = ., batch = batch[colnames(.)]) %>%
add_metadata_matrix(metadata_df = metadata)
Found 14 genes with uniform expression within a single batch (all zeros); these will not be adjusted for batch.
Found2batches
Adjusting for0covariate(s) or covariate level(s)
Standardizing Data across genes
Fitting L/S model and finding priors
Finding parametric adjustments
Adjusting the Data
Joining, by = "sample_ids"
overlap_DM_grouped_log_combat <-overlap_grouped_log_combat %>%
map(~ pivot_longer(., -c(sample_ids, condition, seq_method), names_to = "tiles", values_to = "log_transf_perc_meth")%>%
tidyr::unite(tiles, tiles, condition, sep=".") %>%
dplyr::filter(tiles %in% c(diff_meth_tiles)) %>%
tidyr::separate(tiles, into = c("chrom", "start", "end", "condition"), sep = "\\.") %>%
tidyr::unite(tiles, chrom, start, end, sep=".") %>%
pivot_wider(names_from = tiles, values_from = log_transf_perc_meth))
overlap_DM_grouped_log_combat<-overlap_DM_grouped_log_combat[2:4]
overlap_DM_grouped_log_combat_pca_plot <- overlap_DM_grouped_log_combat%>%
map2(
names(.),
~ autoplot(prcomp(dplyr::select(.x, starts_with("chr"))), data = .x, colour = "seq_method") +
labs(title = .y, colour="Sequencing \nmethod") +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1)
)
annotate_figure(ggarrange(plotlist = overlap_DM_grouped_log_combat_pca_plot , nrow=2, ncol=2, labels="AUTO", common.legend = TRUE, legend="right"), text_grob("ComBat in Log Space\nDM Tiles", face = "bold", size=16))
overlap_log_combat_log_fold_change <- overlap_log %>% calc_log_fold_change()
overlap_DM_log_combat_log_fold_change <- overlap_log_combat_log_fold_change %>%
tidyr::unite(tiles, tiles, condition, sep = ".") %>%
dplyr::filter(tiles %in% c(diff_meth_tiles)) %>%
tidyr::separate(tiles, into = c("chrom", "start", "end", "condition"), sep = "\\.")
# Plot
p38 <- overlap_DM_log_combat_log_fold_change %>% ggplot(aes(x = log_fold_change_targeted, y = log_fold_change_WGBS)) +
geom_point(colour = "#7CAE00", shape = 16, size=2, alpha=0.6) +
ylab("log fold change WGBS") +
xlab("log fold change Targeted Sequencing") +
labs(title = "ComBat in Log Space\nAll Common Tiles") +
stat_cor(method="pearson")+
ylim(-5, 5) +
xlim(-5, +5) +
theme(plot.title = element_text(hjust = 0.5, face="bold"), aspect.ratio = 1) +
facet_grid(cols = vars(condition))
p38
data <- list("Log" = overlap_grouped_log_pca, "ComBat"=overlap_grouped_log_combat, "Quantile Normalisation \n+ ComBat" = overlap_grouped_log_normalised_combat_pca) %>%
map(bind_rows, .id = "condition2") %>%
map(~ bind_cols(
dplyr::select(.x, !starts_with("chr")), prcomp(dplyr::select(.x, starts_with("chr")))$x
)) %>%
bind_rows(.id = "normalization") %>%
mutate(normalization = factor(normalization, levels = c( "Log", "ComBat", "Quantile Normalisation \n+ ComBat")))
ggplot(data, aes(PC1, PC2, colour = condition, shape=seq_method)) +
geom_point(size=2) +
facet_grid(normalization ~ condition2, margins = "condition2", scales = "free") +
labs(shape="Sequencing method", colour = "Condition" ) +
theme_bw() +
theme(aspect.ratio = 1)+
scale_x_continuous(sec.axis = sec_axis(~ . , name = "All Common Tiles", breaks = NULL, labels = NULL))
ggsave(bg ="white",
"/local/AAkalin_cardiac/Results/cardiac/Plots/FigureS5.pdf",
device = "pdf",
plot = figureS5
)
Saving 7 x 7 in image